Neurosynth electrode word matrix

Download the Neurosynth database and the associated abstracts from PubMed, extract cognitive keywords from each abstract, and project the study coordinates onto the Smartphone Brain Scanner surface to obtain an electrode-by-word matrix.


In [1]:
# Wildcard import of the notebook environment; it is assumed to
# provide math, numpy as np and pandas as pd, all used below
from everything import *

In [2]:
from brede.data.neurosynth import NeurosynthDatabase
from brede.data.pubmed import Pubmed
from brede.data.words import CognitiveWords
from brede.core.matrix import Matrix
from brede.data.sbs2 import SBS2Data

In [3]:
# Log to a logfile named 'brede.log'
import logging
logger = logging.getLogger()

file_handler = logging.FileHandler(filename='brede.log', mode='a')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)
logger.setLevel(logging.DEBUG)

In [4]:
pubmed = Pubmed()

In [5]:
# Load Neurosynth
nd = NeurosynthDatabase()
nd_database = nd.database()
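
A quick look at the study table helps orient the rest of the notebook; only the `id`, `x`, `y` and `z` columns are used below.

In [ ]:
# Peek at the columns used later: `id` holds PubMed IDs and
# x, y, z the reported activation coordinates
nd_database[['id', 'x', 'y', 'z']].head()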

In [6]:
# Get abstracts for Neurosynth papers from PubMed.
# This may take several hours.
medlines = pubmed.get_medlines(set(nd_database.id))
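
Since the download takes hours, it may be worth caching the result; a minimal sketch using pickle (the filename 'medlines.pkl' is arbitrary):

In [ ]:
# Optional: cache the downloaded MEDLINE records so a kernel restart
# does not require a new multi-hour download
import pickle

with open('medlines.pkl', 'wb') as fid:
    pickle.dump(medlines, fid)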

In [7]:
# Find keywords in each abstract and collect them as a list of keyword lists
cognitive_words = CognitiveWords()
corpus = []
for n, medline in enumerate(medlines):
    abstract = medline.get('AB', '').lower()
    keywords = cognitive_words.find_all(abstract)
    corpus.append(keywords)
    logger.debug(('Iterating over medline abstracts '
                  'for keyword extraction: {}').format(n))
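
A spot check on a single abstract can confirm the extraction behaves as expected; a sketch, assuming the first record has an abstract:

In [ ]:
# Show the start of the first abstract and the keywords found in it
print(medlines[0].get('AB', '')[:200])
print(corpus[0][:10])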

In [8]:
# Corpus-wide keywords
all_keywords = [word for wordlist in corpus for word in wordlist]
all_unique_keywords = set(all_keywords)
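
Before building the matrix, the most frequent keywords can be inspected; a small sketch with collections.Counter:

In [ ]:
# The ten most common extracted keywords across the corpus
from collections import Counter

Counter(all_keywords).most_common(10)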

In [9]:
# Build the bag-of-phrases matrix: one row per paper (PMID), one column
# per unique keyword. Initialize with float zeros so that the fractional
# counts for multiword phrases below are not truncated.
bag_of_phrases = pd.DataFrame(0.0, index=[medline['PMID'] for medline in medlines],
                              columns=list(all_unique_keywords))
for n, (medline, keywords) in enumerate(zip(medlines, corpus)):
    pmid = medline['PMID']
    for keyword in keywords:
        bag_of_phrases.loc[pmid, keyword] += 1
        # Credit the parts of a multiword phrase fractionally, e.g.,
        # 'working memory' adds 0.5 to 'working' and 0.5 to 'memory'
        if ' ' in keyword:
            keyword_parts = keyword.split()
            for keyword_part in keyword_parts:
                if keyword_part in all_unique_keywords:
                    bag_of_phrases.loc[pmid, keyword_part] += 1.0 / len(keyword_parts)
    logger.debug(('Iterating over medline abstracts '
                  'for matrix construction: {}').format(n))
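
A sanity check on the dimensions and sparsity of the resulting matrix; a sketch:

In [ ]:
# One row per paper, one column per unique keyword;
# the fraction of nonzero entries should be small
print(bag_of_phrases.shape)
print((bag_of_phrases.values > 0).mean())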

In [10]:
# Scale bag-of-phrases matrix with IDF
scaled = Matrix(bag_of_phrases).idf()
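
The exact weighting is defined by brede's Matrix.idf; as a sketch of the idea only, a common IDF variant multiplies each term count by log(N / n_t), where N is the number of documents and n_t the number of documents containing term t:

In [ ]:
# Equivalent-in-spirit IDF scaling (the variant implemented by
# Matrix.idf may differ in detail)
N = len(bag_of_phrases)
n_t = (bag_of_phrases > 0).sum(axis=0)
idf_scaled = bag_of_phrases * np.log(float(N) / n_t)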

In [11]:
# Read Smartphone Brain Scanner surface
sbs2_data = SBS2Data()
surface = sbs2_data.surface()
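
The number of surface vertices determines the number of rows in the final electrode word matrix; a quick check:

In [ ]:
# Vertex coordinates of the surface model, one row per vertex
surface.vertices.shape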

In [12]:
# Group activation coordinates by study and prepare a
# studies-by-vertices matrix for the Gaussian kernel values
grouped = nd_database[['id', 'x', 'y', 'z']].groupby('id')
v = np.zeros((len(grouped), surface.vertices.shape[0]))
sigma = 10.0  # kernel width in millimeters; float avoids integer division
norm1 = 1 / (sigma * math.sqrt(2 * math.pi))  # Gaussian normalization constant
norm2 = -1 / (2 * sigma ** 2)                 # scale of the exponent

In [13]:
# Cortexification of study coordinates: spread each study's reported
# (x, y, z) foci onto the surface vertices with a Gaussian kernel.
# Iterating over the groupby object gives a deterministic (sorted) order.
for n, (study_id, group) in enumerate(grouped):
    coords = group[['x', 'y', 'z']]
    p = 0
    for index, coord in coords.iterrows():
        p += norm1 * np.exp(norm2 * np.sum((surface.vertices - coord.values) ** 2, axis=1))
    p /= math.sqrt(len(coords))
    v[n, :] = p
    if not n % 100:
        logger.debug(('Iterating over studies '
                      'for computing Talairach coordinate load: {}').format(n))
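
Written out, the loop computes for each study n with coordinates c_1, ..., c_K the load at a vertex position r:

$$p_n(r) = \frac{1}{\sqrt{K}} \sum_{k=1}^{K} \frac{1}{\sigma \sqrt{2 \pi}} \exp\left( -\frac{\lVert r - c_k \rVert^2}{2 \sigma^2} \right), \qquad \sigma = 10 \text{ mm}.$$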

In [14]:
# Project the IDF-scaled word counts onto the surface. The rows of
# scaled (indexed by PMID strings from PubMed) are first aligned with
# the study order of v (sorted groupby keys); the str() conversion
# assumes Neurosynth ids are PubMed PMIDs and that an abstract was
# retrieved for every study
study_ids = sorted(grouped.groups)
scaled_aligned = scaled.loc[[str(study_id) for study_id in study_ids]]
product = v.T.dot(scaled_aligned)
product_matrix = Matrix(product, columns=bag_of_phrases.columns)
product_matrix.shape


Out[14]:
(1028, 1135)
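
Each row of the product corresponds to a surface vertex and each column to a keyword, so the strongest word associations at a vertex can be listed; a sketch with an arbitrarily chosen vertex index:

In [ ]:
# Ten highest-loading words for vertex 0
row = product[0]
bag_of_phrases.columns[np.argsort(row)[::-1][:10]]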

In [15]:
product_matrix.to_csv('neurosynth electrode word matrix.csv')
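
The saved matrix can be reloaded later with pandas; a sketch, assuming the default index column was written:

In [ ]:
# Reload the electrode word matrix; the first column holds the row index
word_matrix = pd.read_csv('neurosynth electrode word matrix.csv', index_col=0)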
